\(~\)

library(lubridate)
library(tidyverse)
library(gridExtra)
library(grid)
library(kableExtra)
library(dplyr)
library(dummies)
library(car)
library(glmnet)
library(leaps)
library(caret)
library(pls)
library(MASS)
library(randomForest)
library(tree)
library(gbm)
library(leaps)
library(glmnet)
library(rpart)
library(rpart.plot)
library(broom)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(e1071)
library(vip)

\(~\)

Reading in data

data.read <- read.csv("https://www.skra.is/library/Samnyttar-skrar-/Fyrirtaeki-stofnanir/Fasteignamat-2019/gagnasafn_ib_2018.csv", header=T, sep=";", dec=",", fileEncoding="latin1")

\(~\)

Cleaning data

Discarding unnecessary variables

data <- subset(data.read, select=c(rfastnum, kdagur, nuvirdi, teg_eign, svfn, byggar, efstah, 
                                   fjmib,lyfta, ibm2, fjhaed, fjbilsk, fjbkar, fjsturt,
                                   fjklos, fjeld, fjherb, fjstof, fjgeym, stig10, bilskurm2, 
                                   svalm2, geymm2, matssvaedi, undirmatssvaedi, ibteg))

The variables that were not mentioned in the project description were discarded. So were variables that were very similar to other variables.

\(~\)

Checking if variables are of the correct type

str(data)
## 'data.frame':    49777 obs. of  26 variables:
##  $ rfastnum       : int  10803303 10403734 10662477 10953287 10872101 10003894 10618062 10362378 10149692 10423071 ...
##  $ kdagur         : Factor w/ 1953 levels "01.01.2013","01.01.2014",..: 1795 139 1722 1801 255 1689 325 768 1618 132 ...
##  $ nuvirdi        : int  16400 27871 34592 21205 24243 21198 47331 11550 12411 25175 ...
##  $ teg_eign       : Factor w/ 12 levels "Einbýlishús",..: 6 6 6 1 6 6 1 6 1 6 ...
##  $ svfn           : int  0 0 1300 3000 1400 5200 0 0 2000 1000 ...
##  $ byggar         : Factor w/ 149 levels "(null)","1787",..: 104 110 135 96 122 90 127 75 65 119 ...
##  $ efstah         : int  4 10 3 1 3 3 2 2 2 3 ...
##  $ fjmib          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ lyfta          : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ ibm2           : num  57.7 72.8 116.8 106.8 96 ...
##  $ fjhaed         : int  1 1 1 1 1 1 3 1 3 1 ...
##  $ fjbilsk        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ fjbkar         : int  1 1 1 0 1 1 1 0 1 1 ...
##  $ fjsturt        : int  0 0 1 1 0 0 1 1 0 0 ...
##  $ fjklos         : int  1 1 1 1 1 1 2 1 1 1 ...
##  $ fjeld          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ fjherb         : int  1 2 3 2 3 2 5 1 5 2 ...
##  $ fjstof         : int  1 1 1 1 1 1 2 1 1 1 ...
##  $ fjgeym         : int  1 1 1 0 1 1 4 1 0 0 ...
##  $ stig10         : num  10 10 10 10 10 10 9.9 10 10 10 ...
##  $ bilskurm2      : Factor w/ 642 levels "-70","(null)",..: 3 3 3 171 3 3 564 3 123 134 ...
##  $ svalm2         : Factor w/ 723 levels "-0,2","(null)",..: 3 617 664 3 3 3 3 3 3 549 ...
##  $ geymm2         : Factor w/ 769 levels "(null)","0","0,1",..: 55 2 2 2 27 2 2 14 2 656 ...
##  $ matssvaedi     : int  161 72 700 3000 620 5110 290 100 2050 330 ...
##  $ undirmatssvaedi: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ibteg          : int  12 12 12 11 12 11 11 12 11 12 ...

A few variables were of incorrect type. Those were changed.

\(~\)

Changing variable type

#Numerical to categorical
data <- mutate(data, rfastnum=factor(rfastnum), svfn=factor(svfn), efstah=factor(efstah), 
               lyfta=factor(lyfta), stig10=factor(stig10), matssvaedi=factor(matssvaedi), 
               undirmatssvaedi=factor(undirmatssvaedi), ibteg=factor(ibteg), svfn=factor(svfn))

#Categorical to numerical
data$bilskurm2 <- as.numeric(data$bilskurm2)
data$svalm2 <- as.numeric(data$svalm2)
data$geymm2 <- as.numeric(data$geymm2)

#Factor to date
data$kdagur <- as.Date(data$kdagur, "%d.%m.%Y")

#Fix variable byggar
data$byggar <- as.Date(data$byggar, "%Y"); data$byggar <- lubridate::year(data$byggar)

#Price in million ISK
data$nuvirdi_m <- data$nuvirdi/1000

str(data)
## 'data.frame':    49777 obs. of  27 variables:
##  $ rfastnum       : Factor w/ 41823 levels "10000034","10000075",..: 33627 16891 27721 39895 36586 165 25848 15171 6243 17659 ...
##  $ kdagur         : Date, format: "2015-01-29" "2015-02-03" ...
##  $ nuvirdi        : int  16400 27871 34592 21205 24243 21198 47331 11550 12411 25175 ...
##  $ teg_eign       : Factor w/ 12 levels "Einbýlishús",..: 6 6 6 1 6 6 1 6 1 6 ...
##  $ svfn           : Factor w/ 68 levels "0","1000","1100",..: 1 1 4 13 5 29 1 1 8 2 ...
##  $ byggar         : num  1973 1979 2004 1965 1991 ...
##  $ efstah         : Factor w/ 14 levels "0","1","2","3",..: 5 11 4 2 4 4 3 3 3 4 ...
##  $ fjmib          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ lyfta          : Factor w/ 8 levels "0","1","2","3",..: 1 2 1 1 1 1 1 1 1 1 ...
##  $ ibm2           : num  57.7 72.8 116.8 106.8 96 ...
##  $ fjhaed         : int  1 1 1 1 1 1 3 1 3 1 ...
##  $ fjbilsk        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ fjbkar         : int  1 1 1 0 1 1 1 0 1 1 ...
##  $ fjsturt        : int  0 0 1 1 0 0 1 1 0 0 ...
##  $ fjklos         : int  1 1 1 1 1 1 2 1 1 1 ...
##  $ fjeld          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ fjherb         : int  1 2 3 2 3 2 5 1 5 2 ...
##  $ fjstof         : int  1 1 1 1 1 1 2 1 1 1 ...
##  $ fjgeym         : int  1 1 1 0 1 1 4 1 0 0 ...
##  $ stig10         : Factor w/ 28 levels "4","5.5","6.3",..: 28 28 28 28 28 28 27 28 28 28 ...
##  $ bilskurm2      : num  3 3 3 171 3 3 564 3 123 134 ...
##  $ svalm2         : num  3 617 664 3 3 3 3 3 3 549 ...
##  $ geymm2         : num  55 2 2 2 27 2 2 14 2 656 ...
##  $ matssvaedi     : Factor w/ 167 levels "11","20","25",..: 19 6 55 71 49 108 32 12 67 35 ...
##  $ undirmatssvaedi: Factor w/ 73 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ibteg          : Factor w/ 2 levels "11","12": 2 2 2 1 2 1 1 2 1 2 ...
##  $ nuvirdi_m      : num  16.4 27.9 34.6 21.2 24.2 ...

All variables now seemed to be of the correct type. Next we wanted to discard some useless observations and combine some categories of some of the categorical variables.

\(~\)

Discarding some observations with missing data

nrow(data)
## [1] 49777
data <- na.omit(data)
nrow(data)
## [1] 49760

A total of 17 rows had missing values and these were removed. All seemed to involve the variable “byggar”.

\(~\)

Discarding real estate outside the capital area

nrow(data)
## [1] 49760
capital <- c(0, 1000, 1100, 1300, 1400, 1604) #Numbers for capital area
data <- filter(data, svfn%in%capital)
data <- mutate(data, svfn = fct_recode(svfn, Reykjavík="0", Kópavogur="1000",
                                       Seltjarnarnes="1100", Garðabær="1300",
                                       Hafnarfjörður="1400", Mosfellsbær="1604"))
nrow(data)
## [1] 35116
outsidecapital <- c(999)      #Number for Reykjavik rural area (single property)
data <- filter(data, matssvaedi != outsidecapital)
nrow(data)
## [1] 35115

We have in this step removed 14,645 observations that don´t belong to Reykjavík capital and now approximately 35 thousand observations remain.

\(~\)

Discarding some types of properties

There are many types of properties (teg_eign categorical variable has 12 levels). Are all these categories necessary?

summary(data$teg_eign)
##    Einbýlishús   Fjölbýlishús       Gistihús       Herbergi Hótelstarfsemi 
##           3176             19              0              1              3 
##     Íbúðareign      Íbúðarhús Ósamþykkt íbúð         Parhús         Raðhús 
##          28766             47             66            780           2253 
##        Séreign     Vinnustofa 
##              2              2

We see that several categories have no or very few observations (<5).

We remove the following categories with few (<5) observations: guest house, room, hotel, private property, working area. (Gistihús, Herbergi, Hótelstarfsemi, Séreign, Vinnustofa)

teg.eign.in <- c("Einbýlishús", "Fjölbýlishús", "Íbúðareign", "Íbúðarhús", "Ósamþykkt íbúð", "Parhús", "Raðhús") 
data <- filter(data, teg_eign%in%teg.eign.in)
data <- droplevels(data) # drop unused levels in the dataset
nrow(data)
## [1] 35107

After this cleaning we have removed 8 properties and there are 35.107 observations left.

\(~\)

Translating types of properties into English

levels(data$teg_eign)
## [1] "Einbýlishús"    "Fjölbýlishús"   "Íbúðareign"     "Íbúðarhús"     
## [5] "Ósamþykkt íbúð" "Parhús"         "Raðhús"
data <- mutate(data, teg_eign = fct_recode(teg_eign, "single-family property"="Einbýlishús", 
                                           "apartment building"="Fjölbýlishús",
                                           "apartment"="Íbúðareign",
                                           "apartment building"="Íbúðarhús",
                                           "illegal apartment"="Ósamþykkt íbúð",
                                           "two-family building"="Parhús",
                                           "town-house"="Raðhús"))
levels(data$teg_eign)
## [1] "single-family property" "apartment building"    
## [3] "apartment"              "illegal apartment"     
## [5] "two-family building"    "town-house"

\(~\)

Combining categories of categorical variables

data$kdagur <- format(data$kdagur,'%Y') #Take away dates and only leave years

data$byggar_cat <- cut(data$byggar, seq(1840,2020, 5)) #Cut year built every five years

data$matssvaedi_samein <- as.factor(data$matssvaedi)
data$matssvaedi_samein <- recode(data$matssvaedi_samein, "c('600','620','630','640','650','660','670','680')='Hafnarfjörður';
                                 c('500','510','511','520','530','540')='GarðabærNorður';
                                 c('550','560')='GarðabærSuður';
                                 c('300','320','330','340','350','351')='Kópavogur';     c('800','810','820','840','290')='MosfellsbærVest/Kjalarnes';
                                 c('850')='MosfellsbærHelgafell';
                                 c('110','120','130','140','180','181')='Grafarvogur/Grafarholt';
                                 c('700')='Álftanes';
                                 c('400')='Seltjarnarnes';
                                 c('20','25','31')='MiðbærRvk';                                
                                 c('11','70','72','75')='VesturbærRvk';
                                 c('80','85','90','91','100')='AusturRvk';
                                 c('280','281','282','283','284')='AustAusturRvk';
                                 c('150','160','161','170')='Breiðholt';
                                 c('200','210','220','270')='Árbær'")

All dates for each year bought (kdagur) were combined - Or rather, dates and months were removed from kdagur so only the year remained.

Also, new variable was made for the variable year built (byggar) by combining all observations every five years.

Finally, some of the categories in the variable area (matssvaedi) were renamed and combined.

At last combine some categories for undirmatssvaedi

# group undirmatssvaedi
seeside <- c('105','106','18','49','8','53','32','34','60','33','50','10','6','54','11','61','104','47','58','9','45','51','52','57')

data$undirmatssvaedi_cat <- ifelse(data$undirmatssvaedi %in% seeside, "1", "0")                 
table(data$undirmatssvaedi_cat)
## 
##     0     1 
## 33649  1458

\(~\)

Create a new variable for price per square meter (thousand ISK/m^2)

data <- mutate(data, Price_m2 = nuvirdi/ibm2)
summary(data$Price_m2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   74.79  285.79  344.64  361.66  421.64 8652.91
ggplot(data = data, aes(y=Price_m2)) + geom_boxplot() + theme_classic()

The median price/m2 is around 345 thousand ISK/m2.

There is one outlier with price/m2 = 8652.91. Look more closely at the outlier.

data %>% filter(Price_m2>2000)
##   rfastnum kdagur nuvirdi  teg_eign        svfn byggar efstah fjmib lyfta
## 1 10543921   2016  956146 apartment Mosfellsbær   2016      3     1     1
##    ibm2 fjhaed fjbilsk fjbkar fjsturt fjklos fjeld fjherb fjstof fjgeym
## 1 110.5      1       1      0       1      1     1      2      1      2
##   stig10 bilskurm2 svalm2 geymm2 matssvaedi undirmatssvaedi ibteg
## 1     10         3    157    401        850               0    12
##   nuvirdi_m  byggar_cat    matssvaedi_samein undirmatssvaedi_cat Price_m2
## 1   956.146 (2015,2020] MosfellsbærHelgafell                   0 8652.905
data <- filter(data, Price_m2<2000)
nrow(data)
## [1] 35106

The outlier is a 2-room apartment in Mosfellsbær with a price of 956 million ISK - very unlikely. We remove this single observation.

#Look at the distribution of price/m2 again
summary(data$Price_m2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   74.79  285.79  344.64  361.42  421.63 1160.47
ggplot(data = data, aes(y=Price_m2)) + geom_boxplot() + theme_classic()

cleanup = theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                panel.background = element_blank(), axis.line = element_line(color = "black")) #Make cleanup-code

ggplot(data = data, aes(Price_m2, fill=teg_eign)) + geom_histogram(binwidth = 10) + theme_classic() + cleanup + scale_y_continuous(expand=c(0,0), limits=c(0,1750))

Constructing descriptive plots

Year of purchase

ggplot(data, aes(x=kdagur, y=nuvirdi_m)) +
  geom_boxplot(fill='#A4A4A4', color="black") +
  theme_classic() + ggtitle("Price with outliers") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p1

ggplot(data, aes(x=kdagur, y=nuvirdi_m)) +
  geom_boxplot(outlier.shape=NA, fill='#A4A4A4', color="black") +
  theme_classic() + scale_y_continuous(limits = c(0, 100)) + 
  ggtitle("Price without outliers") + 
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p2

grid.arrange(p1, p2, ncol = 2)

ggplot(data, aes(x=kdagur, y=Price_m2)) +
  geom_boxplot(fill='#A4A4A4', color="black") +
  theme_classic() + ggtitle("Price/m2 with outliers") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p1

ggplot(data, aes(x=kdagur, y=Price_m2)) +
  geom_boxplot(outlier.shape=NA, fill='#A4A4A4', color="black") +
  theme_classic() + scale_y_continuous(limits = c(0, 1000)) + 
  ggtitle("Price/m2 without outliers") + 
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Year of purchase") + ylab("Sale price (thousand ISK)") -> p2

grid.arrange(p1, p2, ncol = 2)

We see that real estate prices have been steadily increasing in the last few years.

\(~\)

Year of construction

ggplot(data = data, aes(byggar, fill=teg_eign)) + geom_histogram(binwidth = 2) + theme_classic() +
  labs(x="Year built", y="Number of properties", fill="Type of property") + scale_y_continuous(expand=c(0,0), limits=c(0,2000))

ggplot(data, aes(x=byggar_cat, y=Price_m2)) +
  geom_boxplot(fill='#A4A4A4', color="black") +
  theme_classic() + ggtitle("Price/m2 and year of construction") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.x = element_text(angle = 90, hjust = 1)) + 
  xlab("Year of construction") + ylab("Sale price (thousand ISK)")  + 
  geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)

Thus the oldest (built before 1940) and newest (built after 2010) properties are most expensive. The brand new properties built after 2015 have the highest price.

Byggar (sorted by meðalfermetraverð)

The highest price/m2 are for the years 2016-2018, and also for a very few old houses built before or around 1900. Thus the oldest (built before 1940) and newest (built after 2010) properties are most expensive.

Byggar (sorted by year of construction)

Number of constructions per period and price/m2

The years on top regarding number of constructions (numbers and price listed) were 2006 and 2007 and 2004, just before the economic “crash” (Hrunið) in 2008.

  1. 2006 992 347.3
  2. 2007 946 337.5
  3. 2004 941 348.3
  4. 2014 856 430.3
  5. 1978 768 321.2

The year 2009 just after the crash shows a very sharp downturn in the construction activity - more than 7-fold decrease. However, the price stayed similar. The years 2009 and 2010 were also slow, but then the recovery started in 2013 and really exploded in 2014 followed by some decrease in 2015.

  1. 2009 138 336.1
  2. 2011 156 333.5
  3. 2010 230 366.2
  4. 2012 190 346.6
  5. 2013 475 356.9
  6. 2015 652 434.2

\(~\)

Property size

cleanup_point = 
  theme(panel.background = element_blank(), axis.line = element_line(color = "black"), panel.grid.major = element_blank(), legend.key=element_blank()) #Cleanup for scatterplot

ggplot(data, aes(x=ibm2, y=nuvirdi, color=teg_eign)) + geom_point() + 
  labs(x="Property size", y="Sale price", color="Type of property") + 
  cleanup_point -> p1

ggplot(data, aes(x=ibm2, y=Price_m2, color=teg_eign)) + geom_point() + 
  labs(x="Property size", y="Price/m2", color="Type of property") + 
  cleanup_point -> p2

grid.arrange(p1, p2, ncol = 2)

We see that price generally increases with property size but price/m2 tends to decrease.

\(~\)

Location

ggplot(data, aes(matssvaedi, fill=teg_eign)) + geom_bar() + theme_classic() + 
  xlab("Area") + ylab("Number of properties") + labs(fill="Type of property") + scale_y_continuous(expand=c(0,0), limits=c(0,2500)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

\(~\)

After merging areas

ggplot(data, aes(matssvaedi_samein, fill=teg_eign)) + geom_bar() + theme_classic() + 
  xlab("Area") + ylab("Number of properties") + labs(fill="Type of property") +
  scale_y_continuous(expand=c(0,0), limits=c(0,6500)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

\(~\)

Location ~ price

na.omit(data) %>% 
  group_by(data$matssvaedi) %>%
  summarize(Count=n(),
            Meðalfermetraverð = round(mean(Price_m2),1), 
            Staðalfrávik = round(sd(Price_m2),1),) %>%
  mutate(Percent = round((Count/sum(Count)*100))) %>%
  arrange(desc(Meðalfermetraverð))
## # A tibble: 60 x 5
##    `data$matssvaedi` Count Meðalfermetraverð Staðalfrávik Percent
##    <fct>             <int>             <dbl>        <dbl>   <dbl>
##  1 31                   89              497.        164.        0
##  2 550                 253              468.         71.2       1
##  3 20                 1663              454.        134.        5
##  4 25                  536              451.        124         2
##  5 75                  111              435         123.        0
##  6 11                  351              429.        116         1
##  7 560                  58              422.         91.6       0
##  8 90                 1714              421.        116.        5
##  9 70                  496              418.        110         1
## 10 400                 645              414.        109.        2
## # … with 50 more rows

We see that are nr. 31 has by far the highest price/m2 (Miðbær:Suður-Þingholt). Second comes nr. 550 (Garðabær:Urriðaholt) and then nr. 20 (Miðbær:Frá Tjörn að Snorrabraut).

\(~\)

Price/m2 ~ area (matssvaedi combined)

Red line indicates median price/m2

ggplot(data, aes(x=matssvaedi_samein, y=Price_m2)) + geom_boxplot() + 
  theme_classic() + labs(x="Area", y="Price per m^2 (thousand ISK)") + ylim(0,1000) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)

\(~\)

Byggar ~ location (combined)

ggplot(data, aes(x=matssvaedi_samein, y=byggar)) +
  geom_boxplot(fill='#A4A4A4', color="black") +
  theme_classic() + ggtitle("Location vs. year of construction") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.text.x = element_text(angle = 90, hjust = 1)) + 
  xlab("Location (matssvaedi)") + ylab("Year of construction") 

Some properties with really high prices in RvkMiðbær - where are these located exactly (sub-areas/undirmatssvaedi)?

data.s <- data[rev(order(data$Price_m2, na.last = FALSE)), ]
data.s[1:20,]
##       rfastnum kdagur nuvirdi               teg_eign          svfn byggar
## 28415 10805575   2017   24486              apartment     Reykjavík   1947
## 4703  10547338   2012   65519              apartment     Reykjavík   2014
## 19089 10960906   2015  142541              apartment     Reykjavík   2015
## 19090 10158390   2015  168311              apartment     Reykjavík   2015
## 31419 10553943   2017  251019 single-family property     Reykjavík   1954
## 34048 10805028   2017   23000              apartment     Reykjavík   1958
## 27218 10062144   2016   47000    two-family building     Reykjavík   1926
## 9314  10238187   2013  129149              apartment     Reykjavík   2008
## 34567 10236346   2018   44850              apartment     Reykjavík   2004
## 27515 10846742   2016   56409             town-house     Reykjavík   1948
## 32214 10506190   2017  165000              apartment     Reykjavík   2008
## 34568 10052930   2018   45784              apartment     Reykjavík   2004
## 24426 10633210   2016  170460              apartment Seltjarnarnes   2016
## 24818 10379194   2016   23344              apartment     Reykjavík   1905
## 33308 10584240   2017  125710              apartment     Reykjavík   2015
## 25776 10878324   2016  122524 single-family property     Reykjavík   1927
## 29107 10356430   2017   32986 single-family property Hafnarfjörður   1900
## 10150 10327189   2014  122766              apartment     Reykjavík   2015
## 33082 10067667   2017  192228 single-family property     Reykjavík   1935
## 31784 10768751   2017   86469              apartment     Reykjavík   1934
##       efstah fjmib lyfta  ibm2 fjhaed fjbilsk fjbkar fjsturt fjklos fjeld
## 28415      2     1     0  21.1      2       0      1       0      1     1
## 4703       7     1     3  58.8      1       1      0       1      1     1
## 19089      9     1     3 128.8      1       2      1       1      2     1
## 19090      9     1     3 165.0      1       2      1       1      2     1
## 31419      2     2     0 247.7      2       0      1       2      3     1
## 34048      4     1     1  22.7      1       0      0       1      1     1
## 27218      1     1     0  46.4      1       0      0       1      1     1
## 9314       9     1     3 128.8      1       3      1       2      2     1
## 34567      4     1     0  44.8      1       0      0       1      1     1
## 27515      1     1     0  56.6      1       0      0       1      1     1
## 32214      9     1     3 166.4      1       1      1       1      2     1
## 34568      4     1     0  46.2      1       0      0       1      1     1
## 24426      5     1     0 173.5      1       2      0       1      1     1
## 24818      2     1     0  23.8      1       0      0       1      1     1
## 33308      9     1     3 128.8      1       1      1       1      2     1
## 25776      2     2     0 127.4      1       0      1       0      1     1
## 29107      1     1     0  34.3      1       0      0       1      1     1
## 10150      9     1     3 128.8      1       2      1       1      2     1
## 33082      2     1     0 203.2      2       0      1       1      2     1
## 31784      2     1     0  91.6      1       0      1       0      1     1
##       fjherb fjstof fjgeym stig10 bilskurm2 svalm2 geymm2 matssvaedi
## 28415      1      1      1     10         3      3      2        100
## 4703       1      1      1     10         3    549    655         20
## 19089      2      1      0     10         3     85    744         20
## 19090      2      1      0     10         3     59    713         20
## 31419     10      2      1     10       286    221    194         70
## 34048      0      1      0      7         3    382      2         90
## 27218      2      1      0     10         3      3    463         20
## 9314       2      1      0     10         3     85    661         20
## 34567      1      1      0     10         3    698    296         20
## 27515      1      1      0     10         3      3      2         20
## 32214      2      1      0     10         3     59     77         20
## 34568      1      1      0     10         3      3    294         20
## 24426      3      1      1     10         3    249    388        400
## 24818      0      1      0     10         3      3     56         20
## 33308      2      1      0     10         3     85    709         20
## 25776      2      3      0     10         3      3      2         25
## 29107      2      1      0     10         3      3    148        600
## 10150      2      1      0     10         3     85    661         20
## 33082      4      3      0     10        75    550    520         31
## 31784      4      2      1     10       436    613    437         31
##       undirmatssvaedi ibteg nuvirdi_m  byggar_cat matssvaedi_samein
## 28415               0    12    24.486 (1945,1950]         AusturRvk
## 4703               47    12    65.519 (2010,2015]         MiðbærRvk
## 19089              51    12   142.541 (2010,2015]         MiðbærRvk
## 19090              51    12   168.311 (2010,2015]         MiðbærRvk
## 31419               6    11   251.019 (1950,1955]      VesturbærRvk
## 34048               0    12    23.000 (1955,1960]         AusturRvk
## 27218               0    11    47.000 (1925,1930]         MiðbærRvk
## 9314               51    12   129.149 (2005,2010]         MiðbærRvk
## 34567               0    12    44.850 (2000,2005]         MiðbærRvk
## 27515              27    11    56.409 (1945,1950]         MiðbærRvk
## 32214              51    12   165.000 (2005,2010]         MiðbærRvk
## 34568               0    12    45.784 (2000,2005]         MiðbærRvk
## 24426              45    12   170.460 (2015,2020]     Seltjarnarnes
## 24818               0    12    23.344 (1900,1905]         MiðbærRvk
## 33308              51    12   125.710 (2010,2015]         MiðbærRvk
## 25776               0    11   122.524 (1925,1930]         MiðbærRvk
## 29107               0    11    32.986 (1895,1900]     Hafnarfjörður
## 10150              51    12   122.766 (2010,2015]         MiðbærRvk
## 33082               0    11   192.228 (1930,1935]         MiðbærRvk
## 31784               0    12    86.469 (1930,1935]         MiðbærRvk
##       undirmatssvaedi_cat  Price_m2
## 28415                   0 1160.4739
## 4703                    1 1114.2687
## 19089                   1 1106.6848
## 19090                   1 1020.0667
## 31419                   1 1013.3993
## 34048                   0 1013.2159
## 27218                   0 1012.9310
## 9314                    1 1002.7096
## 34567                   0 1001.1161
## 27515                   0  996.6254
## 32214                   1  991.5865
## 34568                   0  990.9957
## 24426                   1  982.4784
## 24818                   0  980.8403
## 33308                   1  976.0093
## 25776                   0  961.7268
## 29107                   0  961.6910
## 10150                   1  953.1522
## 33082                   0  946.0039
## 31784                   0  943.9847
remove(data.s)

It appears that sub-area nr. 51 (101-Skuggahverfi-Sjávarsíða) comes up very frequently.

\(~\)

Median price/m2 for sub-area nr. 51

data %>% filter(undirmatssvaedi == "51") %>%
  summarize(Count=n(),
            Meðalfermetraverð = round(mean(Price_m2),1), 
            Staðalfrávik = round(sd(Price_m2),1),) %>%
  mutate(Percent = round((Count/sum(Count)*100))) %>%
  arrange(desc(Meðalfermetraverð))
##   Count Meðalfermetraverð Staðalfrávik Percent
## 1    79             612.6        183.9     100

So the median price/m2 for properties in this sub-area is very high, or 613 compared to 345 for the whole capital area.

\(~\)

Look more closely at sub-areas (undirmatssvaedi)

data %>% filter(undirmatssvaedi != "0") %>% 
  ggplot(aes(x=undirmatssvaedi, y=Price_m2)) + geom_boxplot() + 
  theme_classic() + labs(x="Area", y="Price per m^2 (thousand ISK)") + ylim(0,1000) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)

\(~\)

What are the sub-areas with the highest price/m2?

data %>% filter(undirmatssvaedi != "0") %>% 
  group_by(undirmatssvaedi) %>%
  summarize(Count=n(),
            Meðalfermetraverð = round(mean(Price_m2),1), 
            Staðalfrávik = round(sd(Price_m2),1),) %>%
  mutate(Percent = round((Count/sum(Count)*100))) %>%
  arrange(desc(Meðalfermetraverð))
## # A tibble: 67 x 5
##    undirmatssvaedi Count Meðalfermetraverð Staðalfrávik Percent
##    <fct>           <int>             <dbl>        <dbl>   <dbl>
##  1 51                 79              613.        184.        1
##  2 11                  5              564.        167.        0
##  3 57                 14              547.         78.1       0
##  4 45                 59              546.        106.        1
##  5 10                  6              519.        111.        0
##  6 9                   6              519.        136.        0
##  7 106                39              515          82.4       1
##  8 52                103              514.        117.        2
##  9 32                  7              506.        194         0
## 10 59                101              505.        103.        2
## # … with 57 more rows

The most expensive properties are in sub-areas:

  • 51: 101 - Skuggahverfi - sjávarsíða (Reykjavid mid-town seaside)
  • 11: Suðurströnd Seltjarnarness (Seltjarnarnes south seaside)
  • 57: Þorragata (close to Skerjafjörður)
  • 45: Hrólfsskálamelur (Seltjarnarnes south seaside)
  • 10: Skildinganes (Skerjafjörður seaside)
  • 9: Einimelur (Reykjavik-West)

\(~\)

Number of properties per category:

data %>% group_by(teg_eign) %>%
  summarize(Count=n(),
            MedianPriceM2 = round(median(Price_m2)),
            MeanPriceM2 = round(mean(Price_m2),1), 
            StdDev = round(sd(Price_m2),1),) %>%
  mutate(Percent = round((Count/sum(Count)*100))) %>%
  arrange(desc(MeanPriceM2)) %>% kable() %>%
 kable_styling(full_width = F, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>% 
  column_spec(1:6, width = "10em")
teg_eign Count MedianPriceM2 MeanPriceM2 StdDev Percent
apartment building 66 397 399.3 121.7 0
illegal apartment 66 356 389.8 154.8 0
apartment 28765 349 365.9 104.8 82
single-family property 3176 333 347.2 102.1 9
two-family building 780 327 341.1 88.8 2
town-house 2253 319 329.8 88.4 6
data %>%
  ggplot(aes(x=teg_eign, y=Price_m2)) + geom_boxplot() + 
  theme_classic() + labs(x="Type of property", y="Price per m^2 (thousand ISK)") + ylim(0,1200) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)

Look more closely at apartment buildings

data %>% filter(teg_eign == "apartment building") %>% 
  group_by(matssvaedi_samein) %>%
  summarize(Count=n(),
            MedianPriceM2 = round(median(Price_m2)),
            MeanPriceM2 = round(mean(Price_m2),1), 
            StdDev = round(sd(Price_m2),1),) %>%
  mutate(Percent = round((Count/sum(Count)*100))) %>%
  arrange(desc(MeanPriceM2)) %>% kable() %>%
 kable_styling(full_width = F, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>% 
  column_spec(1:6, width = "10em")
matssvaedi_samein Count MedianPriceM2 MeanPriceM2 StdDev Percent
MiðbærRvk 39 450 441.9 128.5 59
GarðabærSuður 9 439 433.9 24.9 14
VesturbærRvk 9 288 305.0 56.0 14
Hafnarfjörður 7 270 278.3 23.8 11
AusturRvk 1 265 265.2 NA 2
AustAusturRvk 1 259 259.2 NA 2

Look more closely at illegal apartments

data %>% filter(teg_eign == "illegal apartment") %>% 
  group_by(matssvaedi_samein) %>%
  summarize(Count=n(),
            MiðgildiFmVerðs = round(median(Price_m2)),
            Meðalfermetraverð = round(mean(Price_m2),1), 
            Staðalfrávik = round(sd(Price_m2),1),) %>%
  mutate(Percent = round((Count/sum(Count)*100))) %>%
  arrange(desc(Meðalfermetraverð)) %>% kable()
matssvaedi_samein Count MiðgildiFmVerðs Meðalfermetraverð Staðalfrávik Percent
VesturbærRvk 11 483 475.5 190.5 17
MiðbærRvk 25 382 400.4 138.5 38
Árbær 1 393 392.8 NA 2
AusturRvk 15 341 369.3 161.0 23
AustAusturRvk 7 346 359.1 153.1 11
Breiðholt 4 310 331.6 128.0 6
Hafnarfjörður 2 261 261.1 1.4 3
GarðabærNorður 1 192 191.9 NA 2

Stig10 (byggingarstig)

ggplot(data, aes(x=stig10, y=Price_m2)) + geom_boxplot() + 
theme_classic() + labs(x="stig10", y="Price per m^2 (thousand ISK)") + ylim(0,1000) +  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)

Apparently stig10 4-7 are more expensive. Reduce stig10 to two variables: >=7 and >7.

data$stig10_sam <- as.factor(data$stig10)

data$stig10_sam <- recode(data$stig10_sam,
                          "c('4','5.5','7','7.1')='low';
                          c('7.6','7.9','8','8.3','8.4','8.5','8.6','8.8','8.9','9','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8','9.9','10')='high'")

levels(data$stig10_sam)
## [1] "high" "low"
ggplot(data, aes(x=stig10_sam, y=Price_m2)) + geom_boxplot() + 
theme_classic() + labs(x="stig10", y="Price per m^2 (thousand ISK)") + ylim(0,1200) +  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)

Elevators

ggplot(data, aes(x=lyfta, y=Price_m2)) + geom_boxplot() + 
theme_classic() + labs(x="Fjöldi lyfta", y="Price per m^2 (thousand ISK)") + ylim(0,1200) +  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept=345, linetype="dashed", color = "red", size=1)

Variable selection

Check for correlation

data1<- data.frame(data)
data1[,c('nuvirdi','nuvirdi_m','rfastnum','Price_m2','matssvaedi','matssvaedi_samein')] <- list(NULL)

data_n <- mutate_if(data1, is.factor, ~ as.numeric(levels(.x))[.x]) #factor to numeric
data_n <- mutate_if(data_n, is.character, ~ as.numeric(levels(.x))[.x]) #character to numeric

cor<-cor(data_n,method="spearman")
round(cor,2)
##                     kdagur teg_eign svfn byggar efstah fjmib lyfta  ibm2
## kdagur                   1       NA   NA     NA     NA    NA    NA    NA
## teg_eign                NA        1   NA     NA     NA    NA    NA    NA
## svfn                    NA       NA    1     NA     NA    NA    NA    NA
## byggar                  NA       NA   NA   1.00   0.25 -0.07  0.53  0.25
## efstah                  NA       NA   NA   0.25   1.00 -0.08  0.64 -0.32
## fjmib                   NA       NA   NA  -0.07  -0.08  1.00 -0.04  0.09
## lyfta                   NA       NA   NA   0.53   0.64 -0.04  1.00 -0.05
## ibm2                    NA       NA   NA   0.25  -0.32  0.09 -0.05  1.00
## fjhaed                  NA       NA   NA  -0.11  -0.34  0.07 -0.24  0.50
## fjbilsk                 NA       NA   NA   0.51   0.40 -0.03  0.60  0.08
## fjbkar                  NA       NA   NA  -0.08  -0.12  0.05 -0.18  0.27
## fjsturt                 NA       NA   NA   0.24  -0.12  0.05  0.11  0.27
## fjklos                  NA       NA   NA   0.06  -0.34  0.14 -0.13  0.59
## fjeld                   NA       NA   NA  -0.03  -0.07  0.35 -0.03  0.12
## fjherb                  NA       NA   NA   0.06  -0.35  0.10 -0.19  0.81
## fjstof                  NA       NA   NA  -0.22  -0.30  0.14 -0.21  0.42
## fjgeym                  NA       NA   NA   0.10   0.07  0.00  0.00  0.01
## stig10                  NA       NA   NA  -0.13   0.02  0.00 -0.04 -0.07
## bilskurm2               NA       NA   NA  -0.03  -0.51  0.07 -0.29  0.57
## svalm2                  NA       NA   NA   0.22   0.34 -0.02  0.22  0.01
## geymm2                  NA       NA   NA   0.26   0.41 -0.06  0.36 -0.17
## undirmatssvaedi         NA       NA   NA   0.06   0.23 -0.01  0.19 -0.01
## ibteg                   NA       NA   NA   0.10   0.67 -0.10  0.32 -0.51
## byggar_cat              NA       NA   NA     NA     NA    NA    NA    NA
## undirmatssvaedi_cat     NA       NA   NA     NA     NA    NA    NA    NA
## stig10_sam              NA       NA   NA     NA     NA    NA    NA    NA
##                     fjhaed fjbilsk fjbkar fjsturt fjklos fjeld fjherb
## kdagur                  NA      NA     NA      NA     NA    NA     NA
## teg_eign                NA      NA     NA      NA     NA    NA     NA
## svfn                    NA      NA     NA      NA     NA    NA     NA
## byggar               -0.11    0.51  -0.08    0.24   0.06 -0.03   0.06
## efstah               -0.34    0.40  -0.12   -0.12  -0.34 -0.07  -0.35
## fjmib                 0.07   -0.03   0.05    0.05   0.14  0.35   0.10
## lyfta                -0.24    0.60  -0.18    0.11  -0.13 -0.03  -0.19
## ibm2                  0.50    0.08   0.27    0.27   0.59  0.12   0.81
## fjhaed                1.00   -0.16   0.16    0.18   0.59  0.14   0.50
## fjbilsk              -0.16    1.00  -0.11    0.10  -0.06 -0.03  -0.06
## fjbkar                0.16   -0.11   1.00   -0.45   0.18  0.08   0.32
## fjsturt               0.18    0.10  -0.45    1.00   0.34  0.09   0.15
## fjklos                0.59   -0.06   0.18    0.34   1.00  0.20   0.51
## fjeld                 0.14   -0.03   0.08    0.09   0.20  1.00   0.16
## fjherb                0.50   -0.06   0.32    0.15   0.51  0.16   1.00
## fjstof                0.39   -0.18   0.16    0.12   0.40  0.24   0.35
## fjgeym               -0.03    0.01  -0.08    0.06   0.02  0.06  -0.02
## stig10               -0.03   -0.03   0.05   -0.07  -0.06  0.00  -0.04
## bilskurm2             0.43   -0.28   0.20    0.20   0.50  0.09   0.51
## svalm2               -0.03    0.17   0.07   -0.08  -0.05  0.01   0.01
## geymm2               -0.27    0.33  -0.06   -0.06  -0.22 -0.02  -0.17
## undirmatssvaedi      -0.06    0.21  -0.10    0.05   0.00 -0.02  -0.07
## ibteg                -0.56    0.25  -0.12   -0.24  -0.55 -0.10  -0.48
## byggar_cat              NA      NA     NA      NA     NA    NA     NA
## undirmatssvaedi_cat     NA      NA     NA      NA     NA    NA     NA
## stig10_sam              NA      NA     NA      NA     NA    NA     NA
##                     fjstof fjgeym stig10 bilskurm2 svalm2 geymm2
## kdagur                  NA     NA     NA        NA     NA     NA
## teg_eign                NA     NA     NA        NA     NA     NA
## svfn                    NA     NA     NA        NA     NA     NA
## byggar               -0.22   0.10  -0.13     -0.03   0.22   0.26
## efstah               -0.30   0.07   0.02     -0.51   0.34   0.41
## fjmib                 0.14   0.00   0.00      0.07  -0.02  -0.06
## lyfta                -0.21   0.00  -0.04     -0.29   0.22   0.36
## ibm2                  0.42   0.01  -0.07      0.57   0.01  -0.17
## fjhaed                0.39  -0.03  -0.03      0.43  -0.03  -0.27
## fjbilsk              -0.18   0.01  -0.03     -0.28   0.17   0.33
## fjbkar                0.16  -0.08   0.05      0.20   0.07  -0.06
## fjsturt               0.12   0.06  -0.07      0.20  -0.08  -0.06
## fjklos                0.40   0.02  -0.06      0.50  -0.05  -0.22
## fjeld                 0.24   0.06   0.00      0.09   0.01  -0.02
## fjherb                0.35  -0.02  -0.04      0.51   0.01  -0.17
## fjstof                1.00   0.00   0.00      0.40  -0.06  -0.21
## fjgeym                0.00   1.00  -0.06     -0.06   0.05   0.03
## stig10                0.00  -0.06   1.00     -0.07  -0.02  -0.03
## bilskurm2             0.40  -0.06  -0.07      1.00  -0.11  -0.30
## svalm2               -0.06   0.05  -0.02     -0.11   1.00   0.23
## geymm2               -0.21   0.03  -0.03     -0.30   0.23   1.00
## undirmatssvaedi      -0.04   0.00   0.02     -0.11   0.06   0.13
## ibteg                -0.40   0.07   0.06     -0.62   0.28   0.40
## byggar_cat              NA     NA     NA        NA     NA     NA
## undirmatssvaedi_cat     NA     NA     NA        NA     NA     NA
## stig10_sam              NA     NA     NA        NA     NA     NA
##                     undirmatssvaedi ibteg byggar_cat undirmatssvaedi_cat
## kdagur                           NA    NA         NA                  NA
## teg_eign                         NA    NA         NA                  NA
## svfn                             NA    NA         NA                  NA
## byggar                         0.06  0.10         NA                  NA
## efstah                         0.23  0.67         NA                  NA
## fjmib                         -0.01 -0.10         NA                  NA
## lyfta                          0.19  0.32         NA                  NA
## ibm2                          -0.01 -0.51         NA                  NA
## fjhaed                        -0.06 -0.56         NA                  NA
## fjbilsk                        0.21  0.25         NA                  NA
## fjbkar                        -0.10 -0.12         NA                  NA
## fjsturt                        0.05 -0.24         NA                  NA
## fjklos                         0.00 -0.55         NA                  NA
## fjeld                         -0.02 -0.10         NA                  NA
## fjherb                        -0.07 -0.48         NA                  NA
## fjstof                        -0.04 -0.40         NA                  NA
## fjgeym                         0.00  0.07         NA                  NA
## stig10                         0.02  0.06         NA                  NA
## bilskurm2                     -0.11 -0.62         NA                  NA
## svalm2                         0.06  0.28         NA                  NA
## geymm2                         0.13  0.40         NA                  NA
## undirmatssvaedi                1.00  0.13         NA                  NA
## ibteg                          0.13  1.00         NA                  NA
## byggar_cat                       NA    NA          1                  NA
## undirmatssvaedi_cat              NA    NA         NA                   1
## stig10_sam                       NA    NA         NA                  NA
##                     stig10_sam
## kdagur                      NA
## teg_eign                    NA
## svfn                        NA
## byggar                      NA
## efstah                      NA
## fjmib                       NA
## lyfta                       NA
## ibm2                        NA
## fjhaed                      NA
## fjbilsk                     NA
## fjbkar                      NA
## fjsturt                     NA
## fjklos                      NA
## fjeld                       NA
## fjherb                      NA
## fjstof                      NA
## fjgeym                      NA
## stig10                      NA
## bilskurm2                   NA
## svalm2                      NA
## geymm2                      NA
## undirmatssvaedi             NA
## ibteg                       NA
## byggar_cat                  NA
## undirmatssvaedi_cat         NA
## stig10_sam                   1
corrplot(cor, method="circle")

So, order by the coefficients,

=> ibm2(property area) is correlated with fjherb(no of rooms), fjklos(no of toilets), bilskurm2(garage area), ibteg(type of property), fjhaed(no of floors in property),

=> lyfta(whether there is elavator or not) is correlated with fjbilsk(no of garage), efstah(wheter the room is on top floor)

==> We decide to remove fjherb, fjklos, bilskurm2, ibteg, fjhaed, fjbilsk, efstah.

\(~\)

Remove variables that have clear correlations with other variables

We remove the following variables that have clear correlations with other variables: - fjherb, fjklos, bilskurm2, ibteg, fjhaed, fjbilsk, efstah - rfastnum (not necessary for predictions)

Make a smaller dataset with fewer variables for prediction models

data.small <- subset(data, select = c(kdagur, nuvirdi, teg_eign, svfn, byggar, fjmib, lyfta, ibm2, fjbkar, fjsturt, fjeld, fjstof, fjgeym, svalm2, geymm2, undirmatssvaedi_cat, matssvaedi_samein, byggar_cat))

PCA

data_n[,c('fjherb','fjklos','bilskurm2','ibteg','fjhaed','fjbilsk','efstah')] <- list(NULL) #Make a smaller dataset for PCA
data_n <- dummy.data.frame(data_n, names=c("kdagur", "teg_eign", "svfn", "byggar_cat", "stig10_sam","undirmatssvaedi_cat"))
pr.out  = prcomp(scale(data_n), scale=TRUE)
summary(pr.out)
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.495 1.3704 1.1732 1.07568 1.02852 0.97827 0.93961
## Proportion of Variance 0.172 0.1444 0.1059 0.08901 0.08137 0.07362 0.06791
## Cumulative Proportion  0.172 0.3165 0.4224 0.51137 0.59274 0.66635 0.73427
##                            PC8     PC9    PC10   PC11    PC12    PC13
## Standard deviation     0.90335 0.85862 0.80242 0.7631 0.62816 0.52956
## Proportion of Variance 0.06277 0.05671 0.04953 0.0448 0.03035 0.02157
## Cumulative Proportion  0.79704 0.85375 0.90328 0.9481 0.97843 1.00000
pve = 100*pr.out$sdev^2/sum(pr.out$sdev^2)
plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue")

We can see that the first 5 PC make up 59.3% of the total variance explained. However, when we look at the graph, we see that while each of the first four principal components explain a substantial amount of variance, there is a marked decrease in the variance explained by further principal components. This suggests that there may be little benefit to examining more than four or so principal components.

#Loadings of PC
rotation <- pr.out$rotation
rotation[,1:5]
##                          PC1         PC2          PC3          PC4
## byggar           0.296675974 -0.41955135  0.209302477  0.281332856
## fjmib           -0.257443186 -0.16794599  0.179059154 -0.550035971
## lyfta            0.376085160 -0.34399854  0.121579700 -0.028131624
## ibm2            -0.371259765 -0.40443305  0.107081114  0.427272074
## fjbkar          -0.236194611  0.17175819  0.608638677  0.308102210
## fjsturt         -0.080964831 -0.50551013 -0.427932717  0.053958048
## fjeld           -0.279527000 -0.23922999  0.219641891 -0.496196023
## fjstof          -0.465820449 -0.21910120  0.080883517  0.072169317
## fjgeym          -0.008037599 -0.15415402 -0.038724965 -0.195433997
## stig10          -0.070018682  0.13483643  0.067734788  0.038648519
## svalm2           0.185924515 -0.07307232  0.439203430 -0.022871827
## geymm2           0.359134396 -0.09767168  0.300867382 -0.213078960
## undirmatssvaedi  0.201128367 -0.25839578  0.004657604 -0.009390208
##                          PC5
## byggar          -0.073625879
## fjmib            0.157588399
## lyfta            0.240778902
## ibm2            -0.019960888
## fjbkar           0.004938184
## fjsturt         -0.024584142
## fjeld            0.047385796
## fjstof           0.026990280
## fjgeym          -0.571685683
## stig10           0.563602827
## svalm2          -0.264692312
## geymm2          -0.040416003
## undirmatssvaedi  0.437722989

Looking at the loadings of the first 5 PC (loadings <-0.4 or >0.4), we can see that:

  • PC1: (-) fjstof contributes negatively

  • PC2: (-) byggar, fjsturt, ibm2 contribute negatively

  • PC3: (+) fjbkar, svalm2 contribute positively; (-) fjsturt contributes negatively

  • PC4: (+) ibm2 contributes positively; (-) fjmib, fjeld contribute negatively

  • PC5: (+) stig10, undirmatssvaedi contribute positively; (-) fjgeym contributes negatively

\(~\)

Predicting sale prices (nuvirdi)

#Create a training and test set
set.seed(3)
ind <- sample(nrow(data.small),nrow(data.small)/2)
train <- data.small[ind,]
test <- data.small[-ind,]

Simple linear model

Using most numerical variables and small data set (numerical variables)

lm <- lm(nuvirdi ~ svfn + byggar + fjmib + lyfta + ibm2 + fjbkar + fjsturt + fjeld + fjstof+ fjgeym + svalm2 + geymm2, data=data.small)

tidy(lm)
## # A tibble: 23 x 5
##    term               estimate std.error statistic   p.value
##    <chr>                 <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)       -10236.     6135.       -1.67 9.52e-  2
##  2 svfnKópavogur       -206.      166.       -1.24 2.15e-  1
##  3 svfnSeltjarnarnes   8847.      425.       20.8  1.41e- 95
##  4 svfnGarðabær        4279.      239.       17.9  2.89e- 71
##  5 svfnHafnarfjörður  -3896.      175.      -22.2  8.86e-109
##  6 svfnMosfellsbær    -1482.      327.       -4.53 5.94e-  6
##  7 byggar                 4.36      3.09      1.41 1.58e-  1
##  8 fjmib               9474.      742.       12.8  2.91e- 37
##  9 lyfta1              1015.      183.        5.55 2.84e-  8
## 10 lyfta2              2322.      250.        9.29 1.70e- 20
## # … with 13 more rows
plot_coeffs <- function(lm) {
  coeffs <- coefficients(lm)
  mp <- barplot(coeffs, col="#3F97D0", xaxt='n', 
                main="Regression Coefficients")
  lablist <- names(coeffs)
  text(mp, par("usr")[3], labels = lablist, srt = 45, 
       adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

plot_pval <- function(lm) {
  p_values <- coef(summary(lm))[,4]
  mp <- barplot(p_values, col="#3F97D0", xaxt='n', 
                main="P-values")
  lablist <- names(p_values)
  text(mp, par("usr")[3], labels = lablist, srt = 45, 
       adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

par(mfrow =c(2,1))
plot_coeffs(lm)
plot_pval(lm)

Effects of variables on price (nuvirdi) in model:

  • Town Kópavogur does not differ from Reykjavik (baseline)
  • Towns Hafnarfjörður and Mosfellsbær have negative effects (cheaper).
  • Towns Garðabær and Seltjarnarnes have positive effects (expensive).
  • Byggar has no effect.
  • Lyfta has a positive effect.
  • Fjbkar and fjeld have negative effects.
  • Fjsturt, fjstof and fjgeym have positive effects.
  • Svalm2 has a negative effect.
  • Geymm2 has a positive effect.

Using most numerical variables on training set and then fit a prediction model on test set

lm2 <- lm(nuvirdi ~ svfn + byggar + fjmib + lyfta + ibm2 + fjbkar + fjsturt + fjeld + fjstof+ fjgeym + svalm2 + geymm2, data=train)

#Calculate test MSE
lm.pred = predict(lm2,test)
mean((lm.pred-test$nuvirdi)^2)
## [1] 112588388

\(~\)

General linear model

Using certain groups of variables and small data set

#Exclude some variables that include svaedi and/or have many categories
glm.fit <- glm(nuvirdi ~. - kdagur - matssvaedi_samein -byggar_cat, data=data.small)

tidy(glm.fit)
## # A tibble: 29 x 5
##    term                        estimate std.error statistic   p.value
##    <chr>                          <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                   -4927.     6058.    -0.813 4.16e-  1
##  2 teg_eignapartment building    -1228.     1305.    -0.941 3.47e-  1
##  3 teg_eignapartment             -6572.      266.   -24.7   1.69e-133
##  4 teg_eignillegal apartment    -10896.     1302.    -8.37  5.88e- 17
##  5 teg_eigntwo-family building   -2930.      417.    -7.03  2.05e- 12
##  6 teg_eigntown-house            -5138.      291.   -17.7   1.71e- 69
##  7 svfnKópavogur                  -321.      163.    -1.98  4.82e-  2
##  8 svfnSeltjarnarnes              7820.      417.    18.8   3.78e- 78
##  9 svfnGarðabær                   3800.      235.    16.1   2.12e- 58
## 10 svfnHafnarfjörður             -4917.      173.   -28.4   6.14e-175
## # … with 19 more rows
plot_coeffs <- function(glm.fit) {
  coeffs <- coefficients(glm.fit)
  mp <- barplot(coeffs, col="#3F97D0", xaxt='n', 
                main="Regression Coefficients")
  lablist <- names(coeffs)
  text(mp, par("usr")[3], labels = lablist, srt = 45, 
       adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

plot_pval <- function(glm.fit) {
  p_values <- coef(summary(glm.fit))[,4]
  mp <- barplot(p_values, col="#3F97D0", xaxt='n', 
                main="P-values")
  lablist <- names(p_values)
  text(mp, par("usr")[3], labels = lablist, srt = 45, 
       adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

par(mfrow =c(2,1))
plot_coeffs(glm.fit)
plot_pval(glm.fit)

Effects of variable on price:

  • Towns: Again Seltjarnarnes and Garðabær have positive effects compared to Reykjavik
  • Towns: Again Hafnarfjörður and Mosfellsbær have negative effects compared to Rvk
  • Byggar now has a small posive effect
  • Lyfta >0 has a positive effect compared to no Lyfta (Lyfta = 0)
  • Fjmib has a large positive effect
  • Fjbkar and fjeld have negative effects
  • Fjsturt, fjstof and fjgeym have positive effects
  • Geymm2 has a positive effect
  • Teg_eign: all categories had negative effects compared to “single-family property” (baseline), especially apartments
  • Stig10: low building stage has a positive effect
  • Undirmatssvaedi_cat: has a positive effect

Summary:

  • Good: Single-family properties, low building stage
  • Good: Seltjarnarnes, Garðabær, Seaside sub-area
  • Bad: Hafnarfjörður, Mosfellsbær
  • Good: Elevators, many apartments per building
  • Good: Many showers, living rooms, storage rooms, large storage rooms
  • Bad: Many bathtubs, many kitchens
#General linear model to predict price based on area and byggar categories
glm.fit2 <- glm(nuvirdi ~ matssvaedi_samein, data=data.small)

tidy(glm.fit2)
## # A tibble: 15 x 5
##    term                               estimate std.error statistic  p.value
##    <chr>                                 <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                          41102.      930.    44.2   0.      
##  2 matssvaedi_sameinÁrbær               -7630.     1014.    -7.53  5.31e-14
##  3 matssvaedi_sameinAustAusturRvk       -1769.     1029.    -1.72  8.57e- 2
##  4 matssvaedi_sameinAusturRvk           -7252.      955.    -7.59  3.27e-14
##  5 matssvaedi_sameinBreiðholt          -12398.      976.   -12.7   6.41e-37
##  6 matssvaedi_sameinGarðabærNorður       8550.     1006.     8.50  2.01e-17
##  7 matssvaedi_sameinGarðabærSuður       11669.     1322.     8.83  1.13e-18
##  8 matssvaedi_sameinGrafarvogur/Graf…   -5113.      971.    -5.26  1.41e- 7
##  9 matssvaedi_sameinHafnarfjörður       -6562.      960.    -6.83  8.34e-12
## 10 matssvaedi_sameinKópavogur           -2301.      954.    -2.41  1.58e- 2
## 11 matssvaedi_sameinMiðbærRvk           -3560.      992.    -3.59  3.33e- 4
## 12 matssvaedi_sameinMosfellsbærHelga…    -535.     1538.    -0.348 7.28e- 1
## 13 matssvaedi_sameinMosfellsbærVest/…    -871.     1062.    -0.820 4.12e- 1
## 14 matssvaedi_sameinSeltjarnarnes       10687.     1136.     9.41  5.35e-21
## 15 matssvaedi_sameinVesturbærRvk        -3412.      994.    -3.43  5.96e- 4
plot_coeffs <- function(glm.fit2) {
  coeffs <- coefficients(glm.fit2)
  mp <- barplot(coeffs, col="#3F97D0", xaxt='n', 
                main="Regression Coefficients")
  lablist <- names(coeffs)
  text(mp, par("usr")[3], labels = lablist, srt = 45, 
       adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

plot_pval <- function(glm.fit2) {
  p_values <- coef(summary(glm.fit2))[,4]
  mp <- barplot(p_values, col="#3F97D0", xaxt='n', 
                main="P-values")
  lablist <- names(p_values)
  text(mp, par("usr")[3], labels = lablist, srt = 45, 
       adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

par(mfrow =c(2,1))
plot_coeffs(glm.fit2)
plot_pval(glm.fit2)

Areas comapared to AustAusturRvk

  • Again Garðabær and Seltjarnarnes have positive effects (both North and South)
  • Breiðholt and Árbær have the largest negative effects, as well as Reykjavík-East and Hafnarfjörður
  • Álftanes, Kópavogur and Mosfellsbær have no effects

Performing on training set and then fit a prediction model on test set

#Exclude some variables that include svaedi and/or have many categories
glm.fit4 <- glm(nuvirdi ~. - kdagur - matssvaedi_samein -byggar_cat, data=train)

#Calculate test MSE
glm.fit4$xlevels[["byggar_cat"]] <- union(glm.fit4$xlevels[["byggar_cat"]], levels(test$byggar_cat))
glm.pred = predict(glm.fit4,newdata =test)
mean((glm.pred-test$nuvirdi)^2)
## [1] 107962700

\(~\)

The Lasso with selected variables (remove variables with many categories)

set.seed(2)
#Creata an x matrix for the model fit and a lambda grid
x <- model.matrix(nuvirdi ~ . -nuvirdi -lyfta -matssvaedi_samein -byggar_cat, data.small)
y <- data.small$nuvirdi
lambda <- 10^seq(10, -2, length = 100)

#Split the data into training and validation sets
train.l = sample(nrow(x), nrow(x)/2)
test.l = (-train.l)
ytest = y[test.l]

#Perform cross-validation to find the optimal lambda value for lasso testing and plot lambda vs. MSE
cv.out <- cv.glmnet(x[train.l,], y[train.l], alpha = 1)
bestlam <- cv.out$lambda.min
bestlam
## [1] 13.45456
plot(cv.out)

#Fit a lasso model using the training data set and predict the best model using the test set. 
lasso.mod <- glmnet(x[train.l,], y[train.l], alpha = 1, lambda = bestlam)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test.l,])

#Calculage the MSE
mean((lasso.pred-ytest)^2)
## [1] 67250135
#Find the coefficients for the best model defined by lasso.
lasso.coef  <- predict(lasso.mod, type = 'coefficients', s = bestlam)
lasso.coef %>% tidy()
##                            row column         value
## 1                  (Intercept)      1 -2.073986e+04
## 2                   kdagur2013      1  1.837146e+03
## 3                   kdagur2014      1  4.445607e+03
## 4                   kdagur2015      1  7.139038e+03
## 5                   kdagur2016      1  1.147419e+04
## 6                   kdagur2017      1  1.875075e+04
## 7                   kdagur2018      1  1.957934e+04
## 8   teg_eignapartment building      1  5.408863e+02
## 9            teg_eignapartment      1 -6.365816e+03
## 10   teg_eignillegal apartment      1 -1.059803e+04
## 11 teg_eigntwo-family building      1 -3.390557e+03
## 12          teg_eigntown-house      1 -5.097201e+03
## 13               svfnKópavogur      1 -1.576999e+02
## 14           svfnSeltjarnarnes      1  8.214351e+03
## 15                svfnGarðabær      1  3.194727e+03
## 16           svfnHafnarfjörður      1 -5.057519e+03
## 17             svfnMosfellsbær      1 -2.792108e+03
## 18                      byggar      1  1.125526e+01
## 19                       fjmib      1  6.896599e+03
## 20                        ibm2      1  2.433579e+02
## 21                      fjbkar      1 -7.598296e+02
## 22                     fjsturt      1  2.023552e+03
## 23                       fjeld      1 -2.691056e+03
## 24                      fjstof      1  1.050546e+03
## 25                      fjgeym      1  4.654945e+02
## 26                      svalm2      1 -2.053533e-01
## 27                      geymm2      1  2.565614e+00
## 28        undirmatssvaedi_cat1      1  1.206995e+04
coef(lasso.mod, s = bestlam) %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(20, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE) +
  ggtitle("Top 20 influential variables (lasso penalty)") +
  xlab("Coefficient") +
  ylab(NULL) + theme_bw()

The Lasso is optimal when around 22 variables are included.

The most influential variables are:

  • positive effect: kdagur (2018, 2017, 2016, 2015) and towns(Seltjarnarnes, Garðabær) and seaside subareas (undirmatssvaedi)
  • negative effect: property type (illegal apartments, apartments, town-house), towns (Hafnarfjörður, Mosfellsbær)

\(~\)

Fix with train and test

Forward selection:

regfit.fwd = regsubsets (nuvirdi ~., data = train, method = "forward", nvmax = 19)
## Reordering variables and trying again:
reg.summary.fwd <- summary(regfit.fwd)

reg.summary.fwd$adjr2
##  [1] 0.5926329 0.6722047 0.6984602 0.7194390 0.7377669 0.7508385 0.7661221
##  [8] 0.7777080 0.7850158 0.7897722 0.7939195 0.7982194 0.8030102 0.8068853
## [15] 0.8096696 0.8123267 0.8147420 0.8176396 0.8201330 0.8226176
which.max(reg.summary.fwd$adjr2)
## [1] 20
plot(reg.summary.fwd$adjr2, xlab = "Number of Variables", ylab="Adj R^2", type = "b", col="blue")

Forward selection: Using all variables gives the best prediction.

#Test set
regfit.fwd2 = regsubsets (nuvirdi ~., data = test, method = "forward", nvmax = 19)
## Reordering variables and trying again:
reg.summary.fwd2 <- summary(regfit.fwd2)

reg.summary.fwd2$adjr2
##  [1] 0.5878741 0.6686811 0.6946132 0.7114390 0.7284384 0.7407086 0.7557391
##  [8] 0.7658533 0.7724712 0.7774559 0.7827701 0.7878955 0.7924211 0.7961447
## [15] 0.7994310 0.8027678 0.8056707 0.8084389 0.8112266 0.8134964
which.max(reg.summary.fwd2$adjr2)
## [1] 20
plot(reg.summary.fwd2$adjr2, xlab = "Number of Variables", ylab="Adj R^2", type = "b", col="blue")

test.mat=model.matrix (nuvirdi~.,data=test)
val.errors =rep(NA ,19)
  for(i in 1:19){
  coefi = coef(regfit.fwd2,id=i)
  pred=test.mat[,names(coefi)]%*% coefi
  val.errors[i]= mean((test$nuvirdi-pred)^2) }
val.errors
##  [1] 125343020 100760810  92869049  87747279  82573277  78837825  74263553
##  [8]  71184433  70687517  70687517  70687517  69259069  69200241  67967624
## [15]  67901073  67244432  66276395  66275311  66054048
which.min (val.errors )
## [1] 19

\(~\)

calculate test MSE

Tree based methods

Regression tree

tree <- tree(formula = nuvirdi ~. - byggar_cat, data = data.small)
tree
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 35106 1.053e+13  36860  
##    2) ibm2 < 122.85 26510 2.706e+12  30510  
##      4) kdagur < 2015.5 16101 1.046e+12  26490  
##        8) ibm2 < 92.75 9973 3.667e+11  22990 *
##        9) ibm2 > 92.75 6128 3.597e+11  32170 *
##      5) kdagur > 2015.5 10409 9.975e+11  36730  
##       10) ibm2 < 89.75 5969 2.967e+11  32130 *
##       11) ibm2 > 89.75 4440 4.046e+11  42910 *
##    3) ibm2 > 122.85 8596 3.453e+12  56460  
##      6) ibm2 < 197.55 6861 1.632e+12  52190  
##       12) kdagur < 2015.5 4089 6.881e+11  46300  
##         24) matssvaedi_samein: Álftanes,Árbær,AustAusturRvk,AusturRvk,Breiðholt,GarðabærNorður,Grafarvogur/Grafarholt,Hafnarfjörður,Kópavogur,MosfellsbærHelgafell,MosfellsbærVest/Kjalarnes 3577 3.803e+11  44330 *
##         25) matssvaedi_samein: GarðabærSuður,MiðbærRvk,Seltjarnarnes,VesturbærRvk 512 1.964e+11  60100 *
##       13) kdagur > 2015.5 2772 5.932e+11  60870 *
##      7) ibm2 > 197.55 1735 1.201e+12  73350  
##       14) matssvaedi_samein: Álftanes,Árbær,AustAusturRvk,Breiðholt,Grafarvogur/Grafarholt,Hafnarfjörður,Kópavogur,MosfellsbærHelgafell,MosfellsbærVest/Kjalarnes 1169 3.845e+11  66010 *
##       15) matssvaedi_samein: AusturRvk,GarðabærNorður,GarðabærSuður,MiðbærRvk,Seltjarnarnes,VesturbærRvk 566 6.232e+11  88520  
##         30) ibm2 < 327.15 528 3.647e+11  84270 *
##         31) ibm2 > 327.15 38 1.164e+11 147600 *
summary(tree)
## 
## Regression tree:
## tree(formula = nuvirdi ~ . - byggar_cat, data = data.small)
## Variables actually used in tree construction:
## [1] "ibm2"              "kdagur"            "matssvaedi_samein"
## Number of terminal nodes:  10 
## Residual mean deviance:  98680000 = 3.463e+12 / 35100 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -97650.0  -5637.0   -693.7      0.0   4518.0 196300.0
plot(tree)
text(tree)

tree.pred = predict(tree, newdata=test)
mean((tree.pred-test$nuvirdi)^2)
## [1] 101173719

In this regression tree, three variables were used for tree construction:

  • imb2
  • kdagur
  • matssvaedi_samein:

  • AustAusturRvk, Álftanes, Árbær, Breiðholt, Grafarvogur/Grafarholt, Hafnarfjörður, Kópavogur, MosfellsbærHelgafell, MosfellsbærVest/Kjalarnes

vs.

  • AusturRvk, GarðabærNorður, GarðabærSuður, MiðbærRvk, Seltjarnarnes, VesturbærRvk

\(~\)

Use a different function for building and drawing a decision tree (use all variables in data.small):

# Create a decision tree model

data.small$price_cat  <- ifelse(data.small$nuvirdi>median(data.small$nuvirdi), "1", "0")
data.small <- mutate(data.small, price_cat=fct_recode(price_cat, Low="0", High="1"))

data.small$price_cat  <- ifelse(data.small$nuvirdi > median(data.small$nuvirdi), "1", "0")
data.small <- mutate(data.small, price_cat = fct_recode(price_cat, Low ="0", High="1"))
tree2 <- rpart(price_cat ~ .-nuvirdi, data = data.small, cp=.02)
summary(tree2)
## Call:
## rpart(formula = price_cat ~ . - nuvirdi, data = data.small, cp = 0.02)
##   n= 35106 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.56819736      0 1.0000000 1.0176618 0.005336626
## 2 0.07862352      1 0.4318026 0.4322584 0.004393737
## 3 0.03048086      2 0.3531791 0.3532931 0.004070990
## 4 0.02000000      3 0.3226983 0.3228122 0.003927262
## 
## Variable importance
##              ibm2          teg_eign            kdagur            fjstof 
##                46                16                13                11 
## matssvaedi_samein              svfn           fjsturt            byggar 
##                 4                 4                 4                 1 
##        byggar_cat 
##                 1 
## 
## Node number 1: 35106 observations,    complexity param=0.5681974
##   predicted class=Low   expected loss=0.4999715  P(node) =1
##     class counts: 17554 17552
##    probabilities: 0.500 0.500 
##   left son=2 (20175 obs) right son=3 (14931 obs)
##   Primary splits:
##       ibm2     < 101.85 to the left,  improve=5796.617, (0 missing)
##       teg_eign splits as  RRLLRR,     improve=2900.233, (0 missing)
##       kdagur   splits as  LLLLRRR,    improve=1968.250, (0 missing)
##       fjstof   < 1.5    to the left,  improve=1364.019, (0 missing)
##       fjsturt  < 0.5    to the left,  improve=1103.771, (0 missing)
##   Surrogate splits:
##       teg_eign          splits as  RRLLRR, agree=0.732, adj=0.371, (0 split)
##       fjstof            < 1.5    to the left,  agree=0.683, adj=0.254, (0 split)
##       matssvaedi_samein splits as  RLLLLRRLLRLLRRL, agree=0.616, adj=0.096, (0 split)
##       svfn              splits as  LRRRLR, agree=0.615, adj=0.095, (0 split)
##       fjsturt           < 1.5    to the left,  agree=0.614, adj=0.092, (0 split)
## 
## Node number 2: 20175 observations,    complexity param=0.07862352
##   predicted class=Low   expected loss=0.2527881  P(node) =0.5746881
##     class counts: 15075  5100
##    probabilities: 0.747 0.253 
##   left son=4 (16129 obs) right son=5 (4046 obs)
##   Primary splits:
##       kdagur            splits as  LLLLLRR, improve=1766.4300, (0 missing)
##       ibm2              < 79.65  to the left,  improve= 634.8837, (0 missing)
##       byggar            < 2013.5 to the left,  improve= 544.7991, (0 missing)
##       byggar_cat        splits as  R--L---LRL-LLLLLLLLLLLLLLLLLLLLLLLLR, improve= 426.0349, (0 missing)
##       matssvaedi_samein splits as  RLRRLRRRLRRRRRR, improve= 237.6448, (0 missing)
##   Surrogate splits:
##       byggar            < 2016.5 to the left,  agree=0.812, adj=0.061, (0 split)
##       byggar_cat        splits as  L--L---LLL-LLLLLLLLLLLLLLLLLLLLLLLLR, agree=0.805, adj=0.025, (0 split)
##       matssvaedi_samein splits as  LLLLLLRLLLLRLLL, agree=0.803, adj=0.017, (0 split)
## 
## Node number 3: 14931 observations
##   predicted class=High  expected loss=0.1660304  P(node) =0.4253119
##     class counts:  2479 12452
##    probabilities: 0.166 0.834 
## 
## Node number 4: 16129 observations
##   predicted class=Low   expected loss=0.1479943  P(node) =0.4594371
##     class counts: 13742  2387
##    probabilities: 0.852 0.148 
## 
## Node number 5: 4046 observations,    complexity param=0.03048086
##   predicted class=High  expected loss=0.3294612  P(node) =0.115251
##     class counts:  1333  2713
##    probabilities: 0.329 0.671 
##   left son=10 (1257 obs) right son=11 (2789 obs)
##   Primary splits:
##       ibm2              < 67.35  to the left,  improve=535.95330, (0 missing)
##       byggar_cat        splits as  R------LR--LRRLLLLRLLLRRRLLLLRRRRRRR, improve=180.51100, (0 missing)
##       byggar            < 1989.5 to the left,  improve=175.87460, (0 missing)
##       matssvaedi_samein splits as  RLRRLRRRRRRRRRR, improve=129.97060, (0 missing)
##       geymm2            < 590.5  to the left,  improve= 61.00849, (0 missing)
##   Surrogate splits:
##       byggar_cat        splits as  R------LR--LRLRLLLRLLRRRRRRRRRRRRRRR, agree=0.716, adj=0.085, (0 split)
##       byggar            < 1943.5 to the left,  agree=0.714, adj=0.080, (0 split)
##       matssvaedi_samein splits as  RRRRRRRRRRLRRRR, agree=0.693, adj=0.013, (0 split)
##       teg_eign          splits as  RLRLRR, agree=0.693, adj=0.010, (0 split)
##       fjstof            < 0.5    to the left,  agree=0.692, adj=0.007, (0 split)
## 
## Node number 10: 1257 observations
##   predicted class=Low   expected loss=0.2871917  P(node) =0.03580585
##     class counts:   896   361
##    probabilities: 0.713 0.287 
## 
## Node number 11: 2789 observations
##   predicted class=High  expected loss=0.156687  P(node) =0.07944511
##     class counts:   437  2352
##    probabilities: 0.157 0.843
# Visualize the decision tree with rpart.plot
rpart.plot(tree2, box.palette="RdBu", shadow.col="gray", nn=TRUE)

Using this tree function the most important variables were:

  • ibm2: 46%
  • teg_eign: 16%
  • kdagur: 13%
  • fjstof: 11%
  • fjsturt: 4%
  • matssvaedi_samein: 4%
  • svfn: 4%
  • byggar: 1%

Random Forest method

{r, warning=F, message=F} set.seed (1) rf =randomForest(nuvirdi~.,data=train, mtry=4, importance =TRUE) rf.pred = predict(rf,newdata =test) mean((rf.pred-test$nuvirdi)^2) vip(rf, num_features = 25, bar = FALSE) + ggtitle(“Variable importance”) + theme_bw()

\(~\)

Support Vector Machine

SVM linear

#tune.out=tune(svm ,nuvirdi~., data=train,kernel ="linear", ranges =list(cost=c(1,2, 5,10,100), gamma=c(2,3,4) ))
svm.fit =svm(nuvirdi~., data=train, kernel ="linear", cost =2)
svmpred = predict(svm.fit,test)
mean((svmpred-test$nuvirdi)^2)
## [1] 55267350

Most optimal model

Among the methods above, we can see that the random forest gives the smallest test MSE. And that the most important variables among all methods are: ibm2, teg_eign, matssvaedi_samein, svfn, kdagur.

{r, warning=F, message=F} #Create new data set with the variables above data.small2 <- subset(data.small, select = c(kdagur, nuvirdi, teg_eign, svfn, ibm2, matssvaedi_samein, fjsturt, lyfta, byggar)) set.seed(3)

Create training & test set

ind2 <- sample(nrow(data.small2),nrow(data.small2)/2) train2 <- data.small[ind2,] test2 <- data.small[-ind2,]

Random Forest

rf2 =randomForest(nuvirdi~.,data=train2, mtry=2, importance =TRUE) rf.pred2 = predict(rf2,newdata =test) mean((rf.pred2-test2$nuvirdi)^2)

\(~\)

Predicting values of a categorical variable

Like mentioned earlier, the regression tree earlier split the areas (matssvaedi) into two groups by price:

Low price areas:

  • AustAusturRvk,
  • AusturRvk,
  • Álftanes,
  • Árbær,
  • Breiðholt,
  • Grafarvogur/Grafarholt,
  • Hafnarfjörður,
  • Kópavogur,
  • MosfellsbærHelgafell,
  • MosfellsbærVest/Kjalarnes

High price areas:

  • GarðabærNorður,
  • GarðabærSuður,
  • MiðbærRvk,
  • Seltjarnarnes,
  • VesturbærRvk

We start with creating a new binary variable that splits areas into these two categories.

data.small <- mutate(data.small, matssvaedi_cat = fct_recode(matssvaedi_samein, "0"="AustAusturRvk",
                                                             "0"="AusturRvk",
                                                             "0"="Álftanes",
                                                             "0"="Árbær",
                                                             "0"="Breiðholt",
                                                             "0"="Grafarvogur/Grafarholt",
                                                             "0"="Hafnarfjörður",
                                                             "0"="Kópavogur",
                                                             "0"="MosfellsbærHelgafell",
                                                             "0"="MosfellsbærVest/Kjalarnes",
                                                             "1"="GarðabærNorður",
                                                             "1"="GarðabærSuður",
                                                             "1"="MiðbærRvk",
                                                             "1"="Seltjarnarnes",
                                                             "1"="VesturbærRvk"))
summary(data.small$matssvaedi_cat)
##     0     1 
## 27777  7329
data.small$matssvaedi_cat <- as.numeric(data.small$matssvaedi_cat)

# Create a new dataset without connected variables (like matssvaedi_samein)
data.cat <- subset(data.small, select = c(kdagur, nuvirdi, teg_eign, byggar, fjmib, lyfta, ibm2, fjbkar, fjsturt, fjeld, fjstof, fjgeym, svalm2, geymm2, matssvaedi_cat))
#Make train and test datasets
train.cat <- data.cat[ind,] 
test.cat <- data.cat[-ind,]

There are 27.777 properties in the “Low price areas” and 7329 properties in the “High price areas”.

\(~\)

Decision tree

# Create a decision tree model
tree.cat <- rpart(matssvaedi_cat~., data = train.cat, cp=.02)
summary(tree.cat)
## Call:
## rpart(formula = matssvaedi_cat ~ ., data = train.cat, cp = 0.02)
##   n= 17553 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.12415848      0 1.0000000 1.0001523 0.01091040
## 2 0.02147231      1 0.8758415 0.8760588 0.01135014
## 3 0.02000000      2 0.8543692 0.8638745 0.01124887
## 
## Variable importance
##   byggar    lyfta teg_eign 
##       84       15        1 
## 
## Node number 1: 17553 observations,    complexity param=0.1241585
##   mean=1.207144, MSE=0.1642354 
##   left son=2 (16547 obs) right son=3 (1006 obs)
##   Primary splits:
##       byggar  < 1939.5  to the right, improve=0.124158500, (0 missing)
##       nuvirdi < 76603   to the left,  improve=0.019412110, (0 missing)
##       lyfta   splits as  LLLRRRLL,    improve=0.016717930, (0 missing)
##       fjbkar  < 0.5     to the right, improve=0.013143810, (0 missing)
##       fjsturt < 0.5     to the left,  improve=0.008275642, (0 missing)
##   Surrogate splits:
##       teg_eign splits as  LRLRLL,      agree=0.943, adj=0.012, (0 split)
##       ibm2     < 26.35   to the right, agree=0.943, adj=0.003, (0 split)
##       nuvirdi  < 186820  to the left,  agree=0.943, adj=0.002, (0 split)
##       geymm2   < 758.5   to the left,  agree=0.943, adj=0.001, (0 split)
## 
## Node number 2: 16547 observations,    complexity param=0.02147231
##   mean=1.171934, MSE=0.142373 
##   left son=4 (15895 obs) right son=5 (652 obs)
##   Primary splits:
##       lyfta   splits as  LLLRRRLL,    improve=0.026275440, (0 missing)
##       nuvirdi < 45230.5 to the left,  improve=0.025888620, (0 missing)
##       byggar  < 2013.5  to the left,  improve=0.012171320, (0 missing)
##       fjsturt < 0.5     to the left,  improve=0.008937531, (0 missing)
##       fjbkar  < 0.5     to the right, improve=0.007275092, (0 missing)
## 
## Node number 3: 1006 observations
##   mean=1.786282, MSE=0.1680424 
## 
## Node number 4: 15895 observations
##   mean=1.159547, MSE=0.1340918 
## 
## Node number 5: 652 observations
##   mean=1.473926, MSE=0.2493202
# Visualize the decision tree with rpart.plot
rpart.plot(tree.cat, box.palette="RdBu", shadow.col="gray", nn=TRUE)

# Test on test set
tree.pred = predict(tree.cat, test.cat)
(err.tree <- mean((tree.pred-test.cat$matssvaedi_cat)^2))
## [1] 0.1391585

Results:

The variable byggar seems to matter most, that is, in recent years, more properties have been built in the “Low price areas” (because they are newer - see subchapter “Byggar ~ location (combined)”). There also seem to be different number of elevators in properties depending on our variable (most likely also linked to age (byggar), newer properties are likely to have more elevators than older ones).

We obtain a test MSE of roughly 0.14.

\(~\)

Random forests

{r, warning=F, message=F} rf.fit <- randomForest(matssvaedi_cat~., data=train.cat, mtry=2) rf.probs <- predict(rf.fit, newdata=test.cat) rf.pred <- ifelse(rf.probs>0.5, “1”, “0”) table(test.cat$matssvaedi_cat, rf.pred)

Don’t think we can use random forests (Warning message: The response has 5 or fewer unique values, are you sure you want to do regression?)

\(~\)

LDA

lda.fit <- lda(matssvaedi_cat~., data=train.cat)
lda.fit
## Call:
## lda(matssvaedi_cat ~ ., data = train.cat)
## 
## Prior probabilities of groups:
##         1         2 
## 0.7928559 0.2071441 
## 
## Group means:
##   kdagur2013 kdagur2014 kdagur2015 kdagur2016 kdagur2017 kdagur2018
## 1  0.1414098  0.1512539  0.1832291  0.1983186  0.1796364 0.02134081
## 2  0.1361386  0.1501650  0.1853685  0.1900440  0.1771177 0.02557756
##    nuvirdi teg_eignapartment building teg_eignapartment
## 1 35257.19               0.0001437091         0.8217288
## 2 42610.20               0.0068756876         0.8217822
##   teg_eignillegal apartment teg_eigntwo-family building teg_eigntown-house
## 1              0.0007185457                  0.02356830         0.06646547
## 2              0.0046754675                  0.01320132         0.05308031
##     byggar    fjmib    lyfta1     lyfta2     lyfta3      lyfta4
## 1 1983.237 1.003665 0.1524754 0.06639362 0.01803550 0.004526838
## 2 1971.181 1.010726 0.1193619 0.05775578 0.05143014 0.016501650
##        lyfta5      lyfta6     lyfta8     ibm2    fjbkar   fjsturt    fjeld
## 1 0.002083782 0.006035784 0.00172451 105.3660 0.7965797 0.5338076 1.004886
## 2 0.017051705 0.000000000 0.00000000 103.3867 0.6782178 0.6641914 1.009626
##     fjstof    fjgeym   svalm2   geymm2
## 1 1.226557 0.6109075 247.4746 272.3390
## 2 1.280253 0.5552805 215.1672 254.5388
## 
## Coefficients of linear discriminants:
##                                       LD1
## kdagur2013                  -2.962908e-01
## kdagur2014                  -5.469451e-01
## kdagur2015                  -7.822006e-01
## kdagur2016                  -1.208779e+00
## kdagur2017                  -1.907400e+00
## kdagur2018                  -1.798832e+00
## nuvirdi                      9.870398e-05
## teg_eignapartment building   2.820009e+00
## teg_eignapartment            7.233430e-01
## teg_eignillegal apartment    2.681230e+00
## teg_eigntwo-family building  1.025169e-01
## teg_eigntown-house           4.525220e-01
## byggar                      -2.907267e-02
## fjmib                       -4.825949e-01
## lyfta1                       3.220267e-01
## lyfta2                       2.165095e-01
## lyfta3                       1.314860e+00
## lyfta4                       1.744920e+00
## lyfta5                       2.655697e+00
## lyfta6                      -1.251785e+00
## lyfta8                      -6.024940e-01
## ibm2                        -2.256004e-02
## fjbkar                      -2.405085e-01
## fjsturt                      1.805437e-01
## fjeld                        1.950035e-01
## fjstof                      -1.040952e-01
## fjgeym                      -1.408140e-01
## svalm2                      -4.350672e-05
## geymm2                      -2.614400e-04
lda.pred <- predict(lda.fit, test.cat)
table(data=lda.pred$class, prediction=test.cat$matssvaedi_cat)
##     prediction
## data     1     2
##    1 13351  2429
##    2   509  1264
(err.LDA <- mean(lda.pred$class!=test.cat$matssvaedi_cat))
## [1] 0.1673788

Results:

LDA with the the same variables as in the decision tree gives a test MSE of 0.17.

\(~\)

Lasso

#Creata a matrix for the model fit and a lambda grid
x <- model.matrix(matssvaedi_cat~., data.cat)
y <- data.cat$matssvaedi_cat
lambda <- 10^seq(10, -2, length = 100)

#Split the data into training and validation sets
set.seed(1)
train.l = sample(nrow(x), nrow(x)/2)
test.l = (-train.l)
ytest = y[test.l]

#Perform cross-validation to find the optimal lambda value for lasso testing and plot lambda vs. MSE
cv.out <- cv.glmnet(x[train.l,], y[train.l], alpha = 1)
(bestlam <- cv.out$lambda.min)
## [1] 0.0001131944
plot(cv.out)

#Fit a lasso model using the training data set and predict the best model using the test set. 
lasso.mod <- glmnet(x[train.l,], y[train.l], alpha = 1, lambda = bestlam)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test.l,])

#Calculage the MSE
(err.lasso <- mean((lasso.pred-ytest)^2))
## [1] 0.1294638
#Find the coefficients for the best model defined by lasso.
lasso.coef  <- predict(lasso.mod, type = 'coefficients', s = bestlam)
lasso.coef %>% tidy()
##                            row column         value
## 1                  (Intercept)      1  1.085610e+01
## 2                   kdagur2013      1 -2.802122e-02
## 3                   kdagur2014      1 -7.507919e-02
## 4                   kdagur2015      1 -1.121351e-01
## 5                   kdagur2016      1 -1.831740e-01
## 6                   kdagur2017      1 -3.080610e-01
## 7                   kdagur2018      1 -2.956879e-01
## 8                      nuvirdi      1  1.660170e-05
## 9   teg_eignapartment building      1  5.161474e-01
## 10           teg_eignapartment      1  1.298891e-01
## 11   teg_eignillegal apartment      1  3.420785e-01
## 12 teg_eigntwo-family building      1  2.430116e-02
## 13          teg_eigntown-house      1  7.507783e-02
## 14                      byggar      1 -4.908401e-03
## 15                       fjmib      1 -9.812785e-02
## 16                      lyfta1      1  5.042063e-02
## 17                      lyfta2      1  3.858494e-02
## 18                      lyfta3      1  2.436021e-01
## 19                      lyfta4      1  3.242303e-01
## 20                      lyfta5      1  4.245498e-01
## 21                      lyfta6      1 -2.036992e-01
## 22                      lyfta8      1 -9.996877e-02
## 23                        ibm2      1 -3.805649e-03
## 24                      fjbkar      1 -4.064817e-02
## 25                     fjsturt      1  3.191027e-02
## 26                       fjeld      1  1.646319e-02
## 27                      fjstof      1 -1.504679e-02
## 28                      fjgeym      1 -1.606324e-02
## 29                      svalm2      1 -7.784295e-06
## 30                      geymm2      1 -4.104561e-05
coef(lasso.mod, s = bestlam) %>%
broom::tidy() %>%
filter(row != "(Intercept)") %>%
top_n(20, wt = abs(value)) %>%
ggplot(aes(value, reorder(row, value), color = value > 0)) +
geom_point(show.legend = FALSE) +
ggtitle("Top 20 influential variables (lasso penalty)") +
xlab("Coefficient") +
ylab(NULL) + theme_bw()

Results:

Lasso selects a model with with lambda = 0.0001131944 where the test MSE = 0.13.

The Lasso is optimal when around 25 variables are included.

The most influential variables are:

  • positive effect: teg_eign (apartment building, illegal apartment) and lyfta (5, 4, 3)
  • negative effect: kdagur (2017, 2018, 2016), lyfta (6)

\(~\)

General linear model

glm.fit <- glm(matssvaedi_cat~., data=train.cat)

tidy(glm.fit)
## # A tibble: 30 x 5
##    term                          estimate   std.error statistic   p.value
##    <chr>                            <dbl>       <dbl>     <dbl>     <dbl>
##  1 (Intercept)                 10.8       0.285           37.9  1.27e-302
##  2 kdagur2013                  -0.0497    0.0105          -4.74 2.20e-  6
##  3 kdagur2014                  -0.0918    0.0104          -8.80 1.47e- 18
##  4 kdagur2015                  -0.131     0.0102         -12.9  6.52e- 38
##  5 kdagur2016                  -0.203     0.0105         -19.4  6.24e- 83
##  6 kdagur2017                  -0.320     0.0117         -27.4  2.06e-162
##  7 kdagur2018                  -0.302     0.0207         -14.6  4.51e- 48
##  8 nuvirdi                      0.0000166 0.000000323     51.3  0.       
##  9 teg_eignapartment building   0.473     0.0708           6.68 2.39e- 11
## 10 teg_eignapartment            0.121     0.0132           9.18 4.60e- 20
## # … with 20 more rows
plot_coeffs <- function(glm.fit) {
coeffs <- coefficients(glm.fit)
mp <- barplot(coeffs, col="#3F97D0", xaxt='n', 
main="Regression Coefficients")
lablist <- names(coeffs)
text(mp, par("usr")[3], labels = lablist, srt = 45, 
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

plot_pval <- function(glm.fit) {
p_values <- coef(summary(glm.fit))[,4]
mp <- barplot(p_values, col="#3F97D0", xaxt='n', 
main="P-values")
lablist <- names(p_values)
text(mp, par("usr")[3], labels = lablist, srt = 45, 
adj = c(1.1,1.1), xpd = TRUE, cex=0.6)
}

plot_coeffs(glm.fit)

plot_pval(glm.fit)

glm.probs <- predict(glm.fit, test.cat, type="response")
glm.pred <- ifelse(glm.probs>0.5,"1","0")
table(glm.pred, test.cat$matssvaedi_cat)
##         
## glm.pred     1     2
##        0     4     0
##        1 13856  3693
(err.glm <- mean(glm.pred!=test.cat$matssvaedi_cat))
## [1] 0.2106193

Results:

GLM with the the same variables as before gives a test MSE of 0.21.

\(~\)

SVM

#svm.linear <- svm(matssvaedi_cat~., data=train.cat, kernel="linear", cost=10)

Results:

RStudio keeps freezing when trying to take the svm further - skip for now

\(~\)

Comparing models

err.tree
## [1] 0.1391585
err.LDA
## [1] 0.1673788
err.lasso
## [1] 0.1294638
err.glm
## [1] 0.2106193

Results:

The Lasso and the Decision tree performed best on the dataset with test MSE of around 0.14.

\(~\)

Interpretation: ???????????????????